Load packages
library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill = NA),
text = element_text(size=20),
strip.placement = "outside",
strip.text = element_text(size=20, color="black"),
strip.background = element_rect(fill="white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(DESeq2)
library(consensusSeekeR)
library(BSgenome.jaNemVect1.1.DToL.Assembly)
library(GenomicRanges)
library(parallel)
library(universalmotif)
library(monaLisa)
library(rtracklayer)
source("../motif-analysis/mta_downstream_functions.R")
Directories and colors
dat_dir <- "ATACSEQ/nucleosome_free_regions/"
pks_dir <- file.path(dat_dir, "macs2_peaks")
cns_dir <- file.path(dat_dir, "consensus_peaks")
hom_dir <- file.path(dat_dir, "homer")
ftp_dir <- file.path(dat_dir, "footprint")
res_dir <- file.path(dat_dir, "results")
fig_dir <- file.path(dat_dir, "plots")
for (newdir in c(cns_dir, hom_dir, res_dir, fig_dir))
dir.create(newdir, showWarnings = FALSE)
# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols = c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
Find consensus set of peaks
require(consensusSeekeR)
require(BSgenome.jaNemVect1.1.DToL.Assembly)
require(parallel)
# load peaks
pks_files <- list.files(pks_dir, pattern="narrowPeak", recursive=FALSE, full.names=TRUE)
names(pks_files) <- str_remove(basename(pks_files), ".mLb.clN.ncfree_peaks.narrowPeak")
nP_list <- lapply(names(pks_files), function(x) {
nP <- readNarrowPeakFile(pks_files[x], extractRegions = TRUE, extractPeaks = TRUE)
names(nP$narrowPeak) <- rep(x, length(nP$narrowPeak))
names(nP$peak) <- rep(x, length(nP$peak))
nP
})
regions <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$narrowPeak))
peaks <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$peak))
names(regions) <- names(pks_files)
names(peaks) <- names(pks_files)
# get consensus
chrList <- Seqinfo(
seqnames = seqnames(BSgenome.jaNemVect1.1.DToL.Assembly),
seqlengths = seqlengths(BSgenome.jaNemVect1.1.DToL.Assembly),
isCircular = c(rep(FALSE, length(seqnames(BSgenome.jaNemVect1.1.DToL.Assembly))-1), TRUE),
genome = "jaNemVect1.1"
)
message(Sys.time(), " Started calculating consensus")
ur <- unlist(regions)
up <- unlist(peaks)
# debugging
# id <- seqnames(ur)=="NC_064035.1" & start(ur)>1644829 & end(ur)<1649211
# ur <- ur[id]
# up <- up[id]
# chrList <- chrList["NC_064035.1"]
results <- findConsensusPeakRegions(
narrowPeaks = ur,
peaks = up,
chrInfo = chrList,
extendingSize = 250,
expandToFitPeakRegion = FALSE,
shrinkToFitPeakRegion = FALSE,
minNbrExp = 2,
nbrThreads = detectCores()-1
)
message(Sys.time(), " Done calculating consensus")
saveRDS(results, file.path(cns_dir,"consensusSeekeR-results.RDS"))
# resize peaks
pks_cns <- results$consensusRanges
pks_mid <- start(pks_cns) + (end(pks_cns)-start(pks_cns))/2
ranges(pks_cns) <- IRanges(pks_mid,width=0)
pks_scl <- promoters(pks_cns, upstream=125, downstream=125)
# trim out-of-bound peaks
seqlengths(pks_scl) <- seqlengths(BSgenome.jaNemVect1.1.DToL.Assembly)[names(seqlengths(pks_scl))]
pks_scl <- trim(pks_scl)
# save bed file
pks_bed <- as.data.table(pks_scl)
pks_bed[,name:=paste0("peak",1:.N)]
pks_bed[,width:=NULL][,score:="."]
setcolorder(pks_bed, c("seqnames", "start", "end", "name", "score", "strand"))
fwrite(pks_bed, file.path(cns_dir,"consensusSeekeR-peaks.bed"), sep="\t", col.names=FALSE)
Get scores for consensus peaks in all samples.
nth=12
res="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks-counts.tsv"
bed="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
bam=$( echo ATACSEQ/nucleosome_free_regions/bam/*R*bam )
cols=$( echo -e seqnames start end name score strand | column -t )
for b in $bam
do
echo $b
name=$( basename $b)
cols=$( echo -e ${cols} ${name} | column -t )
done
echo -e ${cols} > ${res%%tsv}txt
bedtools multicov -bams ${bam} -bed ${bed} > ${res}
Data for differential peaks analysis
# load counts in peaks
pks_ct <- fread(file.path(cns_dir, "consensusSeekeR-peaks-counts.tsv"), header=FALSE)
colnames <- readLines(file.path(cns_dir, "consensusSeekeR-peaks-counts.txt"), n=1)
colnames <- str_remove_all(colnames, ".mLb.clN.ncfree.sorted.bam")
colnames <- str_split(colnames, " ")[[1]]
colnames(pks_ct) <- colnames
# column data
col_dt <- data.table(sample = colnames(pks_ct)[7:28])
col_dt[,reporterline := str_extract(sample,"Elav|Fox|Ncol")]
col_dt[,reporterline := factor(reporterline, levels=c("Elav","Fox","Ncol"))]
col_dt[,condition := str_extract(sample,"pos|neg")]
col_dt[,condition := str_replace_all(condition,c("pos"="positive","neg"="negative"))]
col_dt[,condition := factor(condition, levels = c("negative","positive"))]
col_dt[,group := paste(as.character(reporterline), as.character(condition), sep=""), by=1:nrow(col_dt)]
col_dt[,group := factor(group)]
fwrite(col_dt, file.path(res_dir, "design.tsv"), sep="\t")
col_df <- copy(col_dt)
class(col_df) <- "data.frame"
rownames(col_df) <- col_df$sample
# counts matrix
pks_mt <- as.matrix(pks_ct[, -c(1:6)])
rownames(pks_mt) <- pks_ct$name
pks_mt <- pks_mt[, rownames(col_df)]
write.table(
pks_mt,
file.path(res_dir, "mat.tsv"),
sep = "\t",
quote = FALSE
)
# DESeq2
require(DESeq2)
dds <- DESeqDataSetFromMatrix(
countData = pks_mt,
colData = col_df,
design = ~ condition + reporterline
)
saveRDS(dds, file.path(res_dir, "dds.rds"))
Peaks counts distributions
# normalized accessibility distribution
pks_dt <- as.data.table(pks_mt, keep.rownames = "peak")
pks_dt <- melt.data.table(
pks_dt,
id.vars = "peak",
variable.name = "sample",
value.name = "norm_counts"
)
pks_dt <- merge.data.table(pks_dt, col_dt, by="sample")
setorder(pks_dt, peak, condition, reporterline)
pks_dt[,sample:=factor(sample, levels = unique(pks_dt$sample))]
gp_acc <- ggplot(pks_dt, aes(sample, log10(norm_counts), fill=reporterline)) +
geom_violin(scale = "width", alpha = 0.8, color = "black") +
geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.8, color = "black") +
scale_fill_manual(values = line_cols, limits = force) +
scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
labs(x="samples", y="peak\naccessibility") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.title = element_blank(), legend.position = "none"
)
var_dt <- pks_dt[,.(var=var(norm_counts)),.(peak)]
gp_var <- ggplot(var_dt, aes(log10(var))) +
geom_density() +
scale_x_continuous(limits=c(-4,NA)) +
scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
labs(x = "log10(accessibility variance)")
gp_pch <- gp_acc / gp_var + plot_layout(heights = c(2,1))
gp_pch
Use normalized log-transformed accessibility data
# normalize samples
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- estimateSizeFactors(dds)
norm_mt <- counts(dds, normalized=TRUE)
# save
write.table(
norm_mt,
file.path(res_dir, "mat_norm.tsv"),
sep="\t",
row.names = TRUE,
quote = FALSE
)
# row normalize to bring peaks to same range
norm_mt <- (norm_mt+10)/apply(norm_mt+10,1,median)
norm_mt <- log2(norm_mt)
# save
write.table(
norm_mt,
file.path(res_dir, "mat_norm_fc.tsv"),
sep="\t",
row.names = TRUE,
quote = FALSE
)
PCA on all samples.
set.seed(1950)
pca_res <- prcomp(t(norm_mt), center = TRUE)
# variance explained
pca_var <- data.table(pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2))
pca_var[,pct_cum:=cumsum(pct_var)]
pca_var[,PC:=factor(1:.N)]
gp_var <- ggplot(pca_var, aes(PC, pct_var)) +
geom_bar(stat = "identity") +
geom_line(aes(y = pct_cum, group = 1)) +
geom_point(aes(y = pct_cum)) +
scale_y_continuous(expand = expansion(0.01,0)) +
labs(y = "% of variance\nexplained", x = "PC") +
theme(panel.grid.major.y = element_line(size = 0.5))
pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=reporterline, shape=condition)) +
geom_point(size=5) +
scale_fill_manual(values = line_cols) +
scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
geom_text_repel(aes(label = sample))
gp_pch <- gp_var / gp_bip
gp_pch
PCA per cell line
set.seed(1950)
gp_l <- lapply(names(line_cols), function(cl) {
pca_res <- prcomp(t(norm_mt[,grep(cl,colnames(norm_mt))]), center = TRUE)
# variance explained
pca_var <- data.table(pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2))
pca_var <- pca_var[-nrow(pca_var)]
pca_var[,pct_cum:=cumsum(pct_var)]
pca_var[,PC:=factor(1:.N-1)]
gp_var <- ggplot(pca_var, aes(PC, pct_var)) +
geom_bar(stat = "identity") +
geom_line(aes(y = pct_cum, group = 1)) +
geom_point(aes(y = pct_cum)) +
scale_y_continuous(expand = expansion(0.01,0)) +
labs(y = "% of variance\nexplained", x = "PC") +
theme(panel.grid.major.y = element_line(size = 0.5))
# pca plot
pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=condition, shape=condition)) +
geom_point(size=5) +
scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
scale_fill_manual(values = condition_cols) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
geom_text_repel(aes(label = sample))
gp_var / gp_bip
})
gp_l
## [[1]]
##
## [[2]]
##
## [[3]]
Use normalized peak counts
norm_mt <- read.table(
file.path(res_dir, "mat_norm_fc.tsv"),
header = TRUE
)
Identify marker peaks
# gene markers by high FC + significant DEseq2 LTR test
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- DESeq(dds, test="LRT", reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj
names(dds_qval) <- rownames(dds_res)
peaks_deseq <- names(which(dds_qval<1e-2))
peaks_high <- names(which(apply(
norm_mt, 1, function(x) sort(x, decreasing = TRUE)[2] >= 1.8
)))
peaks_marks <- intersect(peaks_high, peaks_deseq)
length(peaks_marks)
## [1] 2844
Cluster peaks
set.seed(1950)
# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks, function(k) {
cl <- kmeans(norm_mt[peaks_marks,], k)
cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
geom_line() + geom_point()+
scale_x_continuous(breaks = ks)
elbow_gp
Cluster peaks
# kmeans
set.seed(1950)
k <- 19
cl <- kmeans(norm_mt[peaks_marks,], k)
peaks_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
cor_peaks <- cor(t(norm_mt[gs,]))
hclust_peaks <- hclust(as.dist(
1 - cor(cor_peaks)),
method = "ward.D2"
)
rownames(cor_peaks)[hclust_peaks$order]
})
names(peaks_order_list) <- unique(cl$cluster)
peaks_order_list <- peaks_order_list[as.character(seq_along(peaks_order_list))]
# cluster clusters
cluster_order <- hclust(dist(
cor(t(cl$centers)), method = "euclidean"
), method = "ward.D2")$order
peaks_order_list <- peaks_order_list[as.character(cluster_order)]
peaks_order <- unname(unlist(peaks_order_list))
clusters_dt <- data.table(
peak = peaks_order,
clusters = as.character(rep(
names(peaks_order_list),
sapply(peaks_order_list, length)
))
)
# group clusters (manually)
clusters_dt[clusters %in% c(16), group :=1 ] # Elav
clusters_dt[clusters %in% c(5), group :=2 ] # Elav + Fox
clusters_dt[clusters %in% c(14,17), group := 3] # Fox
clusters_dt[clusters %in% c(6,11), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(9), group := 5]
clusters_dt[clusters %in% c(13,4), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(1,2,15,7,8,10,19,18), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(3), group := 9]
setorder(clusters_dt, group)
peaks_order <- clusters_dt$peak
# add clusters info to peaks coordinates bed file
pks_bed <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(pks_bed, c("V4"), c("peak"))
clusters_bed <- merge.data.table(
pks_bed, clusters_dt,
by = "peak",
sort = FALSE
)
setcolorder(clusters_bed, c(colnames(pks_bed)))
clusters_bed[, clusters := paste0("C", clusters)]
clusters_bed[, group := paste0("G", group)]
fwrite(
clusters_bed,
file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"),
sep = "\t",
col.names = FALSE
)
Heatmap of markers
# order rows and columns
samples_order <- col_dt[order(condition,reporterline)]$sample
plot_mt <- norm_mt[peaks_order,samples_order]
# center at 0
plot_min <- quantile(abs(range(plot_mt)),0.75)
plot_min <- 5
plot_mt <- pmin(pmax(plot_mt,-plot_min),plot_min)
# heatmap colors
col_vec <- colorRampPalette(
RColorBrewer::brewer.pal(11,'BrBG')
)(1000)
col_fun <- circlize::colorRamp2(
seq(-plot_min, plot_min, length.out = length(col_vec)),
col_vec
)
# color annotaitons
col_ann <- HeatmapAnnotation(
which = "column", border = TRUE,
"reporterline" = as.character(
col_dt[match(colnames(plot_mt),sample)]$reporterline
),
"condition" = as.character(
col_dt[match(colnames(plot_mt),sample)]$condition
),
col = list(
"reporterline" = line_cols,
"condition" = condition_cols
)
)
# # peak module annotations (clusters)
# clann <- clusters_dt[match(rownames(plot_mt),peak)]$clusters
# clann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$clusters)
# clann <- factor(clann, levels=clann_lab)
# module_ann <- HeatmapAnnotation(
# which = "row", border = TRUE,
# "cluster" = anno_block(
# labels = clann_lab,
# gp = gpar(col=NA)
# )
# )
# row_split <- clann
# peak module annotations (manually groupped clusters)
grann <- clusters_dt[match(rownames(plot_mt),peak)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$group)
grann <- factor(grann, levels=grann_lab)
module_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"group" = anno_block(
labels = grann_lab,
gp = gpar(col=NA)
)
)
row_split <- grann
# peaks annotations
row_labels_marks_ids <- match(
clusters_dt[,.SD[1],clusters]$peak,
rownames(plot_mt)
)
row_labels_marks <- clusters_dt[,.SD[1], clusters]$clusters
mark_ann <- HeatmapAnnotation(
which = "row",
marker = anno_mark(at = row_labels_marks_ids, labels = row_labels_marks),
show_legend = FALSE
)
mark_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"padj<0.05" = rownames(plot_mt) %in% peaks_deseq,
"var>1" = rownames(plot_mt) %in% peaks_vari,
"FC>2" = rownames(plot_mt) %in% peaks_fc,
col = list(
"padj<0.05" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
"var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
"FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
)
)
# heatmap
hm <- Heatmap(
plot_mt, name = "normalized\naccessibility",
col = col_fun,
cluster_rows = FALSE, cluster_columns = FALSE,
show_row_names = FALSE, show_column_names = TRUE,
row_title = "peaks",
row_split = row_split,
cluster_row_slices = FALSE,
top_annotation = col_ann,
left_annotation = module_ann, right_annotation = mark_ann,
border = TRUE
)
hm
Load data
# peaks
peaks <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(peaks, c("seqnames", "start", "end", "name", "score", "strand"))
peaks_gr <- makeGRangesFromDataFrame(peaks, keep.extra.columns = TRUE)
# genes
genes_gr <- rtracklayer::import(
"annotation/Nvec_v4_merged_annotation_sort.gtf",
"gtf"
)
genes_gr <- genes_gr[genes_gr$type == "transcript"]
genes_gr$name <- genes_gr$transcript_id
# non-expressed genes to remove
counts <- read.table(
"RNASEQ_QUANTIFICATION/raw_counts_rnaseq.tsv",
header = TRUE,
row.names = 1
)
exclude_genes <- names(which(
apply(counts, 1, function(x) sum(x) < 10)
))
length(exclude_genes) # 437
## [1] 437
Assign peaks to genes
# assign
assign <- mta_match_peaks_to_genes(
gff_object = genes_gr,
peak_object = peaks_gr,
index_object = "genome/Nvec_vc1.1_gDNA.fasta.fai",
list_genes = NULL,
feature_to_match = "transcript",
feature_field = "name",
exclude_genes = NULL,
max_tss_dist = 10000,
min_overlap = 0,
max_gap = 1,
promoter_upstream = 200,
promoter_downstream = 50,
promoter_object = NULL
)
setDT(assign)
setnames(assign, "chr", "seqnames")
assign[, seqnames := factor(seqnames, levels = unique(peaks$seqnames))]
setorder(assign, seqnames, start, end)
# save
fwrite(
assign,
file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"),
sep = "\t",
col.names = TRUE
)
Inspect peak assignment results
assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
max_assign_peaks_per_gene <- unique(
assign[, .(peak,gene)][!is.na(gene)]
)[, .N, gene]
gb1 <- ggplot(max_assign_peaks_per_gene, aes(N)) +
geom_bar() +
labs(x = "peaks per gene", y = "genes") +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
scale_x_continuous(
breaks = c(1, seq(5, max(max_assign_peaks_per_gene$N), 5))
)
max_assign_gene_per_peak <- unique(
assign[, .(peak,gene)][!is.na(gene)]
)[, .N, peak]
gb2 <- ggplot(max_assign_gene_per_peak, aes(N)) +
geom_bar() +
labs(x = "genes per peak", y = "peaks") +
scale_y_continuous(
expand = expansion(mult = c(0, 0.01)),
labels = scales::label_number(scale = 0.001, suffix = "K")
) +
scale_x_continuous(breaks = seq(1, max(max_assign_peaks_per_gene$N)))
gb1 + gb2
Calculate gene accessibility scores as weighted sum of normalized peak scores assigned to genes.
assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
setnames(assign, "seqnames", "chr")
genes_gff <- genes_gr#[!genes_gr$name %in% exclude_genes]
genes_gff$symbol <- genes_gff$name
peaks_mt <- read.table(file.path(res_dir, "mat_norm.tsv"))
# cells_groups <- fread(file.path(res_dir, "design.tsv"))[, .(sample, group)]
# setnames(cells_groups, "sample", "cell")
gscore <- mta_gene_scores(
genes_peaks_table = assign,
gff_object = genes_gff,
peak_object = peaks_gr,
peaks_mat = peaks_mt
)
saveRDS(gscore, file.path(res_dir, "consensusSeekeR-gene-scores.rds"))
write.table(
gscore$genes_scores_matrix,
file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
sep = "\t",
col.names = TRUE
)
For DESeq2, calculate scores as sums of raw peak counts (they have to be raw counts and integers).
assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
setnames(assign, "seqnames", "chr")
genes_gff <- genes_gr#[!genes_gr$name %in% exclude_genes]
genes_gff$symbol <- genes_gff$name
peaks_mt <- read.table(file.path(res_dir, "mat.tsv"))
gscore <- mta_gene_scores(
genes_peaks_table = assign,
gff_object = genes_gff,
peak_object = peaks_gr,
peaks_mat = peaks_mt,
weight_peaks = FALSE
)
saveRDS(gscore, file.path(res_dir, "consensusSeekeR-gene-scores-raw.rds"))
write.table(
gscore$genes_scores_matrix,
file.path(res_dir, "consensusSeekeR-gene-scores-raw.tsv"),
sep = "\t",
col.names = TRUE
)
Use normalized gene scores
# load gene scores
norm_mt <- read.table(
file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
header = TRUE
)
# row normalize to bring genes to same range
norm_mt <- (norm_mt + 10) / apply(norm_mt + 10, 1, median)
norm_mt <- log2(norm_mt)
Identify marker genes (here use raw gene scores for DESeq2)
# gene markers by normalized expression
genes_fc <- names(which(apply(norm_mt, 1, function(x)
sort(x, decreasing = TRUE)[1] >= 2
)))
genes_vari <- names(which(apply(norm_mt, 1, function(x)
var(x) > 1
)))
con_mt <- read.table(
file.path(res_dir, "consensusSeekeR-gene-scores-raw.tsv"),
)
# gene markers by high FC + significant DEseq2 LTR test
dds <- DESeqDataSetFromMatrix(
countData = con_mt,
colData = col_df,
design = ~ condition + reporterline + condition:reporterline
)
dds <- DESeq(dds, test = "LRT", reduced = ~ 1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj
names(dds_qval) <- rownames(dds_res)
genes_deseq <- names(which(dds_qval < 1e-2))
genes_high <- names(which(apply(norm_mt, 1, function (x)
sort(x, decreasing = TRUE)[1] > 1.8
)))
gene_marks <- intersect(genes_high, genes_deseq)
# inclusde golden markers
marks_gold <- fread(
file.path("annotation", "golden-marks-230116.tsv"),
fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
gene_marks <- unique(c(gene_marks, marks_gold[gene %in% genes_deseq]$gene))
Select number of clusters for genes
set.seed(1950)
# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks, function(k) {
cl <- kmeans(norm_mt[gene_marks, ], k)
cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = ks) +
theme(panel.grid.major = element_line(size = 0.5))
elbow_gp
Cluster genes
set.seed(1950)
# kmeans
k <- 18
cl <- kmeans(norm_mt[gene_marks, ], k)
gene_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
cor_genes <- cor(t(norm_mt[gs,]))
hclust_genes <- hclust(as.dist(1 - cor(cor_genes)), method = "ward.D2")
rownames(cor_genes)[hclust_genes$order]
})
names(gene_order_list) <- unique(cl$cluster)
gene_order_list <- gene_order_list[as.character(seq_along(gene_order_list))]
# cluster clusters
cluster_order <- hclust(
dist(cor(t(cl$centers)),
method = "euclidean"),
method = "ward.D2"
)$order
gene_order_list <- gene_order_list[as.character(cluster_order)]
gene_order <- unname(unlist(gene_order_list[cluster_order]))
clusters_dt <- data.table(
gene = unlist(gene_order_list),
clusters = as.character(
rep(names(gene_order_list), sapply(gene_order_list, length))
)
)
# group clusters (manually)
clusters_dt[clusters %in% c(8, 15), group := 1] # Elav
clusters_dt[clusters %in% c(1, 4), group := 2] # Elav + Fox
clusters_dt[clusters %in% c(16, 2), group := 3] # Fox
clusters_dt[clusters %in% c(3), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(13), group := 5] # Fox + Ncol + Elav
clusters_dt[clusters %in% c(17, 5), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(7, 14, 6, 10, 11, 9), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(18), group := 9]
setorder(clusters_dt, group)
gene_order <- clusters_dt$gene
# save
fwrite(
clusters_dt,
file.path(res_dir, "genes_clusters.tsv"),
sep = "\t"
)
Heatmap of markers
# order rows and columns
col_dt <- fread(file.path(res_dir, "design.tsv"))
samples_order <- col_dt[order(condition, reporterline)]$sample
plot_mt <- norm_mt[gene_order, samples_order]
# center at 0
plot_min <- quantile(abs(range(plot_mt)), 0.75)
plot_min <- 4
plot_mt <- pmin(pmax(plot_mt, -plot_min), plot_min)
# heatmap colors
col_vec <- colorRampPalette(RColorBrewer::brewer.pal(11, 'BrBG'))(1000)
col_fun <- circlize::colorRamp2(
seq(-plot_min, plot_min, length.out = length(col_vec)),
col_vec
)
# color annotations
col_ann <- HeatmapAnnotation(
which = "column",
border = TRUE,
"reporterline" = as.character(
col_dt[match(colnames(plot_mt), sample)]$reporterline
),
"condition" = as.character(
col_dt[match(colnames(plot_mt), sample)]$condition
),
col = list("reporterline" = line_cols, "condition" = condition_cols)
)
# gene module annotations
clann <- clusters_dt[match(rownames(plot_mt), gene)]$clusters
clann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$clusters)
clann <- factor(clann, levels = clann_lab)
module_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"cluster" = anno_block(
labels = clann_lab,
gp = gpar(col = NA)
)
)
row_split <- clann
# gene module annotations (manual groups)
grann <- clusters_dt[match(rownames(plot_mt), gene)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$group)
grann <- factor(grann, levels = grann_lab)
module_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"group" = anno_block(
labels = grann_lab,
gp = gpar(col = NA)
)
)
row_split <- grann
# genes annotations
marks_gold <- fread(
file.path("annotation", "golden-marks-230116.tsv"),
fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
tfs_annt <- fread(
file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))
marks_tfs <- merge.data.table(
tfs_annt,
marks_gold,
by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
marks_tfs <- marks_tfs[gene %in% rownames(plot_mt)]
marks_tfs[
gene %in% clusters_dt[group==5]$gene,
name := str_replace_all(og, c(
"HMGbox_Sox.HG1.25:like:BHMG1/SOX1/SOX2/SOX3/SOX14/SOX15/SOX21/SRY:likeclu:18/26/28" = "SOX1/2/3/14/15/21-like",
"zf-C2H2.HG5.17:PRDM6" = "zf-C2H2 PRDM6",
"zf-C2H2.Unclassified" = ""
))
]
marks_tfs <- marks_tfs[name != ""]
row_labels_marks_ids <- match(marks_tfs$gene, rownames(plot_mt))
row_labels_marks <- marks_tfs[
match(rownames(plot_mt)[row_labels_marks_ids], gene)
]$name
mark_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"padj<0.01" = rownames(plot_mt) %in% genes_deseq,
"var>1" = rownames(plot_mt) %in% genes_vari,
"FC>2" = rownames(plot_mt) %in% genes_fc,
"marker" = anno_mark(
at = row_labels_marks_ids,
labels = row_labels_marks
),
col = list(
"padj<0.01" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
"var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
"FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
)
)
# heatmap
hm <- Heatmap(
plot_mt, name = "normalized\naccessibility",
col = col_fun,
cluster_rows = FALSE, cluster_columns = FALSE,
show_row_names = FALSE, show_column_names = TRUE,
row_title = "genes",
row_split = row_split,
cluster_row_slices = FALSE,
top_annotation = col_ann,
right_annotation = mark_ann,
left_annotation = module_ann,
border = TRUE
)
hm
Save per-group foreground and background peaks for motif discovery
dir.create(file.path(hom_dir, "peaks"))
dir.create(file.path(hom_dir, "results"))
clusters_bed <- fread(file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"))
setnames(clusters_bed, c("seqnames","start","end","name","score","strand","cluster","group"))
clusters_bed[,group:=factor(group, levels=paste0("G",seq_along(unique(group))))]
for (ci in levels(clusters_bed$group)) {
fg <- clusters_bed[group==ci]
bg <- clusters_bed[group!=ci]
fwrite(fg, file.path(hom_dir, "peaks",sprintf("peaks-%s-fg.bed",ci)), col.names=FALSE, sep="\t")
fwrite(bg, file.path(hom_dir, "peaks",sprintf("peaks-%s-bg.bed",ci)), col.names=FALSE, sep="\t")
}
De novo and known motifs in groups of accessible peaks
findMotifs(){
genome="genome/Nvec_vc1.1_gDNA.fasta"
homdir="ATACSEQ/nucleosome_free_regions/homer"
bedfg=${homdir}"/peaks/peaks-G"${1}"-fg.bed"
bedbg=${homdir}"/peaks/peaks-G"${1}"-bg.bed"
outdir=$homdir"/results/G${1}-Homer"
echo ""
echo "Starting HOMER analysis for" $1
echo "using the intervals in" $bedfg
echo "Output will be saved to" $outdir
findMotifsGenome.pl $bedfg $genome $outdir -size 250 -len 6,8,10,12 -bg $bedbg
echo ""
echo "Finished de novo analysis for module" $1
}
for cluster in {1..9}
do
findMotifs "$cluster" &
done
wait
echo "Done."
Parse de novo Homer results
dir="./ATACSEQ/nucleosome_free_regions/homer/results/"
modules=$( find ${dir} -maxdepth 2 -iname "*-Homer" -printf "%f\n" )
for i in $modules
do
less ${dir}/${i}/homerMotifs.all.motifs | grep ">" > tmp.txt
awk -v m="$M" 'BEGIN {FS="\t"} {OFS="\t"} {print m, $0}' tmp.txt >> ${dir}/${i}/homerResults.txt
done
Parse all Homer results
clusters_bed <- fread(file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"))
setnames(
clusters_bed,
c("seqnames", "start", "end", "name", "score", "strand", "cluster", "group")
)
clusters_bed[,group:=factor(group, levels=paste0("G",seq_along(unique(group))))]
ParseHomerDenovo <- function(fn, eta=1) {
# parse homerResults.txt
dt <- fread(fn, sep="\t", header=FALSE)[,-1]
setnames(dt,c("consensus","name","logodds_threshold","logpval","0","occurence","stats"))
dt[,consensus:=stringr::str_remove(consensus,">")]
dt[,c("fg","bg","pval"):=tstrsplit(occurence,",")]
dt_occurence <- dt[,lapply(.SD,stringr::str_remove,pattern="[TBP]:"),.SDcols=c("fg","bg","pval")]
dt_perc <- dt_occurence[,lapply(.SD,function(x) as.numeric(stringr::str_extract(x,"(?<=\\()\\d+\\.*\\d+"))/100),.SDcols=c("fg","bg")]
setnames(dt_perc,c("fg","bg"),c("fg_perc","bg_perc"))
dt_counts <- dt_occurence[,lapply(.SD,function(x) as.numeric(stringr::str_extract(x,"\\d+\\.*\\d*"))),.SDcols=c("fg","bg")]
setnames(dt_counts,c("fg","bg"),c("fg_count","bg_count"))
dt_occurence[,c("fg","bg"):=NULL]
dt_occurence[,pval:=as.numeric(pval)]
dt[,c("fg","bg","pval","occurence"):=NULL]
dt <- cbind(dt,dt_perc,dt_counts,dt_occurence)
dt$qval <- p.adjust(dt$pval, method = "BH")
dt[,fc:=(fg_count+eta)/(bg_count+eta)]
# parse homerResults folder to get names of best hits for denovo motifs
hd <- str_remove(fn, ".txt")
ms <- list.files(hd, pattern = "motif\\d+.motif", full.names=TRUE)
ml <- lapply(ms, function(x) {
m <- universalmotif::read_homer(x)
m@name
})
ml <- unlist(unname(ml))
names(ml) <- str_extract(ml, ".+(?=\\,BestGuess)")
names(ml) <- ml
dt[,name2:=ml[name]]
dt[!is.na(name2), name:=name2]
dt[,name2:=NULL]
# return
dt[,.(name,consensus,fc,pval,logpval,qval,fg_count,fg_perc,bg_count,bg_perc)]
}
ParseHomerKnown <- function(fn, eta=1) {
dt <- fread(fn)
setnames(dt, c("name","consensus","pval","logpval","qval","fg_count","fg_perc","bg_count","bg_perc"))
dt[,fg_perc:=as.numeric(stringr::str_remove(fg_perc,"\\%"))/100]
dt[,bg_perc:=as.numeric(stringr::str_remove(bg_perc,"\\%"))/100]
dt[,fc:=(fg_count+eta)/(bg_count+eta)]
dt[,.(name,consensus,fc,pval,logpval,qval,fg_count,fg_perc,bg_count,bg_perc)]
}
hm_list <- lapply(levels(clusters_bed$group), function(ci) {
dn_fn <- file.path(hom_dir,"results",sprintf("%s-Homer",ci),"homerResults.txt")
dn_dt <- ParseHomerDenovo(dn_fn)
kn_fn <- file.path(hom_dir,"results",sprintf("%s-Homer",ci),"knownResults.txt")
kn_dt <- ParseHomerKnown(kn_fn)
hm_dt <- rbindlist(list(denovo=dn_dt, known=kn_dt), idcol="set")
hm_dt
})
names(hm_list) <- levels(clusters_bed$group)
hm_dt <- rbindlist(hm_list, idcol = "group")
fwrite(hm_dt, file.path(hom_dir, "results", "allResuls.txt"), sep="\t", col.names = TRUE)
Save all Homer motifs
mt_dt <- unique(hm_dt[,.(name)])
mt_list <- lapply(levels(clusters_bed$group), function(ci) {
ci_dir <- file.path(hom_dir,"results",sprintf("%s-Homer",ci))
ci_mfn <- c(
list.files(file.path(ci_dir, "homerResults"), pattern = "motif\\d+.motif", full.names=TRUE),
list.files(file.path(ci_dir, "knownResults"), pattern = "known\\d+.motif", full.names=TRUE)
)
ci_mot <- lapply(ci_mfn, universalmotif::read_homer)
ci_mot
})
names(mt_list) <- levels(clusters_bed$group)
saveRDS(mt_list, file.path(hom_dir,"results","allMotifs.RDS"))
Previously we pulled together Nematostella direct and inferred motifs from CisBP, and motifs additionally transferred based on the DBD %ID. We load them now, together with motifs found by Homer, then we do motif clustering and archetyping.
Load Homer motifs
# load Homer motifs
data_homer <- fread(file.path(hom_dir, "results","allResuls.txt"))
pwms_homer <- readRDS(file.path(hom_dir,"results","allMotifs.RDS"))
pwms_homer <- unlist(unname(pwms_homer))
names(pwms_homer) <- sapply(pwms_homer, function(x) x@name)
Parsed annotation for CisBP motifs
Load CisBP Nematostella motifs
# CisBP motifs + additionally transferred based on %ID
pwms_files_cisbp <- file.path("annotation","CisBP_ident_transfered_motifs.RDS")
pwms_cisbp <- readRDS(pwms_files_cisbp) # 2382
data_cisbp <- fread(str_replace(pwms_files_cisbp,"RDS","tsv"))
gen_cisbp <- data_cisbp$Gene_ID
nms_cisbp <- data_cisbp$name
pwms_cisbp_nm <- lapply(seq_along(pwms_cisbp), function(i) {
motif <- pwms_cisbp[[i]]
if (!is.na(gen_cisbp[i])) motif@altname <- gen_cisbp[i]
if (!is.na(nms_cisbp[i])) motif@extrainfo <- nms_cisbp[i]
return(motif)
})
names(pwms_cisbp_nm) <- names(pwms_cisbp)
Save all motifs
pwms <- c(pwms_homer,pwms_cisbp)
saveRDS(pwms, file.path(res_dir,"motifs-all.rds"))
Parameters for similarity and archetyping
# motif similarity
similarity = "PPM"
method = "PCC"
normalise.scores = TRUE
if (normalise.scores == TRUE) {
normalization <- "norm"
} else {
normalization <- ""
}
# clustering for archetyping
min_cluster_similarity <- 0.8
hclust_method <- "complete"
dist_method <- "euclidean"
# archetyping threshold
IC_threshold <- 0.5
len_threshold <- 8
Calculate pairwise similarity
# motifs
pwms <- readRDS(file.path(res_dir,"motifs-all.rds"))
# similarity
sim_mat <- compare_motifs(
motifs = pwms,
use.type = similarity,
method = method,
normalise.scores = normalise.scores,
min.position.ic = 0,
min.mean.ic = 0
)
rownames(sim_mat) <- colnames(sim_mat) <- names(pwms)
saveRDS(sim_mat,file.path(res_dir,sprintf("motifs-similarity-%s-%s%s.rds",similarity,method,normalization)))
Cluster and order similarity matrix, then generate archetype motifs.
Choose the minimum cluster similarity appropriately so that clusters of motifs to archetype contain only similar motifs (e.g. when using a higher value of 0.8, many cluster contain outlier motifs, with lower values these get split).
# motifs and similarity
pwms <- readRDS(file.path(res_dir,"motifs-all.rds"))
sim_mat <- readRDS(file.path(res_dir,sprintf("motifs-similarity-%s-%s%s.rds",similarity,method,normalization)))
# ordering
reclust_motifs <- TRUE
ord <- rownames(sim_mat)
if (reclust_motifs) {
hc <- hclust(dist(sim_mat, method = dist_method), method = hclust_method)
ord <- hc$labels[hc$order]
}
cuts <- seq(200,300,10)
cuts_scores <- sapply(cuts, function(h) {
ctr <- cutree(hc, k=h)
cl_scores <- sapply(unique(ctr),function(x) {
ms <- names(ctr[ctr==x])
within_cl <- median(sim_mat[ms,ms], na.rm=TRUE)
between_cl <- median(unlist(sim_mat[!(rownames(sim_mat)%in%ms),ms]), unlist(sim_mat[ms,!(colnames(sim_mat)%in%ms)]), na.rm=TRUE)
if (is.na(between_cl)) between_cl <- 1
within_cl/between_cl
})
mean(cl_scores, na.rm=TRUE)
})
k <- cuts[which.max(cuts_scores)]
plot(cuts,cuts_scores)
abline(v=k)
ctr <- cutree(hc, k=k)
# archetyping
arch <- mta_merge_archetype(
sim_mat = sim_mat,
motifs = pwms,
clusters = ctr,
min_cluster_similarity = min_cluster_similarity,
recluster = FALSE,
block_filter = TRUE,
bkg = rep(0.25,4),
pseudocount = 0.0001,
IC_threshold = IC_threshold,
len_threshold = len_threshold,
occupancy_threshold = 1,
verbose = TRUE
)
# list of 1118 archetypes including 3276 motifs (131 archetype(s) fail filters)
arch <- arch[sapply(arch, function(x) length(x)>0)]
arch_file <- file.path(res_dir,sprintf("motif-archetypes-%s-%s%s-%s-IC%s-%sbp.rds",similarity,method,normalization,min_cluster_similarity,IC_threshold,len_threshold))
saveRDS(arch, arch_file)
Annotate archetype motifs (dictionary).
# archetype motifs
arch_file <- file.path(res_dir,"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.rds")
arch <- readRDS(arch_file)
# cisbp direct and inferred motifs for nematostella assigned to genes
CisBP_ann_file <- file.path("annotation","CisBP_ident_transfered_motifs.tsv")
cisbp_ann <- fread(CisBP_ann_file)
setnames(cisbp_ann,c("Gene_ID","Motif_ID"),c("gene","motif"))
cisbp_tfs <- cisbp_ann[,.(gene,motif)][]
TF_motifs_file <- file.path(res_dir,"CisBP_ident_transfered_genes_to_motifs.tsv")
fwrite(cisbp_tfs, TF_motifs_file, sep="\t")
# cisbp family annotation for all direct TFs
CisBP_family_annotation_file <- file.path("annotation","CisBP_2021_08_11_TF_Information.txt")
# tf annotations
TF_annotation_file <- file.path("annotation","curated_TFh_Nvec_DToL_names.tsv")
TF_family_annotation_file <- file.path("annotation","gene_families_searchinfo.csv")
# mapping between CisBP and our TF family annotations
CisBP_TF_family_mapping_file <- file.path("annotation","CisBP_TF_mapping.tsv")
# make dictionary
dict <- mta_archetype_dictionary(
arch = arch,
TF_annotation_file = TF_annotation_file,
TF_motifs_file = TF_motifs_file,
TF_family_annotation_file = TF_family_annotation_file,
CisBP_family_annotation_file = CisBP_family_annotation_file,
CisBP_TF_family_mapping_file = CisBP_TF_family_mapping_file
)
# save dictonary
fwrite(dict, str_replace(arch_file, ".rds$",".dict"), sep="\t", quote=FALSE, col.names = TRUE)
arch_file <- file.path(res_dir,"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.rds")
dict <- fread(str_replace(arch_file, ".rds$",".dict"))
dict_nmot <- dict[,.(archetype_name,archetype_num_motifs)]
gp_nmot <- ggplot(dict_nmot, aes(archetype_num_motifs)) +
geom_bar(color="white") +
scale_x_continuous(expand = expansion(mult=0.01), breaks = c(1,seq(10,100,10))) +
scale_y_continuous(expand = expansion(mult=c(0,0.01))) +
theme(panel.grid.major = element_line(size=0.5)) +
labs(x="number of motifs per archetype", y="number of archetypes")
gp_nmot
Plot archetyping clusters.
# plot archetyping clusters
archetyping_file <- file.path(
fig_dir,
basename(file.path(str_replace(arch_file, "\\.rds$", "-archetyping.pdf")))
)
mta_plot_archetype(arch = arch, dict = dict, output_file = archetyping_file)
# plot archetype logos
pdf(file.path(fig_dir, basename(str_replace(arch_file, "\\.rds$", "-archetypes.pdf"))), width=8, height=3)
for (x in seq_along(arch)) {
message(x)
motif <- arch[[x]]$ppm_consensus
motif@name <- dict[archetype_num==paste0("ARCH",x)]$archetype_name[1]
motif@alphabet <- "DNA"
tryCatch({
print(view_motifs(
motifs = motif,
relative_entropy = TRUE,
normalise.scores = FALSE
) + labs(title = x))
}, error=function(e) message(sprintf("Failed to plot ARCH%s\n%s",x,e)))
}
dev.off()
Save archetype motifs.
# load archetyping results
arch <- readRDS(arch_file)
arch_list <- lapply(arch, function(x) x$ppm_consensus)
# add archetype names to motifs
dict <- fread(file.path(
res_dir,
"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
))
arch_nms <- dict[match(
sapply(arch_list, function(x) x@name),
archetype
)]$archetype_name
names(arch_list) <- arch_nms
arch_list_nm <- lapply(seq_along(arch_list), function(i) {
x <- arch_list[[i]]
x@name <- arch_nms[i]
x
})
names(arch_list_nm) <- arch_nms
saveRDS(arch_list_nm, str_replace(arch_file, ".rds$","-pwms.rds"))
universalmotif::write_homer(
motifs = arch_list_nm,
file = str_replace(arch_file, ".rds$","-pwms.homer"),
overwrite = TRUE
)
arch_list_nm <- lapply(arch_list_nm, function(x) {
x@alphabet <- "DNA"
x
})
universalmotif::write_meme(
motifs = arch_list_nm,
file = str_replace(arch_file, ".rds$","-pwms.meme"),
overwrite = TRUE
)
Plot heatmap of similarity of all motifs used in archetyping (i.e. motifs after filtering), with archetyping clusters indicated.
require(ComplexHeatmap)
pwms <- readRDS(file.path(res_dir,"motifs-all.rds"))
sim_mat_file <- file.path(res_dir,"motifs-similarity-PPM-PCCnorm.rds")
sim_mat <- readRDS(sim_mat_file)
arch_file <- file.path(res_dir,"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.rds")
arch <- readRDS(arch_file)
dict <- fread(str_replace(arch_file, ".rds$",".dict"))
names(arch) <- unique(dict$archetype_name)
heatmap_file <- file.path(
fig_dir,
str_replace_all(basename(sim_mat_file), c("similarity"="similarity-archetypes", ".rds$"=".pdf"))
)
mta_plot_archetype_heatmap(
sim_mat = sim_mat,
arch = arch,
dict = dict,
output_file = heatmap_file,
height = 14,
width = 16
)
We first identify expressed genes and TFs in groups that were defined by clustering.
# TF annotation
tfs_annt <- fread(
file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))
# groupped genes
mark_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"))
mark_dt[, group := paste0("G", group)]
gene_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters_predicted.tsv"))
gene_dt[, group := group_xgb]
group_dt <- rbindlist(list(
clustered = mark_dt[, .(gene, group)],
predicted = gene_dt[, .(gene, group)]
), idcol = "method")
group_dt[, group := factor(
group,
levels = paste0("G", unique(
sort(as.integer(str_remove(group_dt$group, "G")))
))
)]
message(sprintf(
"Total genes: %i", length(group_dt$gene)
))
## Total genes: 20535
# expressed genes
con_mt <- read.table(
file.path("RNASEQ_QUANTIFICATION", "raw_counts_rnaseq.tsv"),
header=TRUE, row.names = 1
)
expr_genes <- names(which(apply(con_mt, 1, function(x) sum(x) > 10) == TRUE))
message(sprintf(
"Expressed genes: %i", length(expr_genes)
))
## Expressed genes: 20080
# expressed TFs
expr_tfs <- expr_genes[expr_genes %in% tfs_annt$gene]
message(sprintf(
"Expressed TFs: %i", length(expr_tfs)
))
## Expressed TFs: 611
expr_dt <- group_dt[gene %in% expr_genes]
expr_dt[, TF := gene %in% expr_tfs]
expr_dt[, TF := factor(TF, levels = c(TRUE, FALSE))]
setorder(expr_dt, group, TF)
How many TFs (do not) have archetypes, and how many archetypes (do not) have a TF?
# archetype dictionary
dict_fnm <- file.path(
res_dir,
"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
)
dict <- fread(dict_fnm)
message(sprintf(
"Fraction of TFs with motifs: %s",
mean(expr_tfs %in% dict$gene)
))
# TF annotation
tf_exp <- unique(tfs_annt[gene %in% expr_tfs][, .SD[1], gene])
# TFs without an archetype
tfs_dt <- unique(dict[gene != ""][
, .(archetype_name, gene)])[
, .(number_of_motifs = .N), gene][
order(number_of_motifs)]
orp_tfs <- unique(tf_exp[!gene %in% dict$gene]$gene)
gp_tfs <- ggplot(tfs_dt, aes(number_of_motifs)) +
geom_bar(color = "white") +
scale_x_continuous(expand = expansion(mult = 0.01), breaks = c(1, seq(10, 100, 10))) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
theme(panel.grid.major = element_line(size = 0.5)) +
labs(
x = "number of archetype motifs per TF",
y="number of TFs",
caption = sprintf(
"%s TFs with motif(s); %s TFs without a motif",
nrow(tfs_dt), length(orp_tfs)
)
)
# archetypes without a TF
arc_dt <- unique(dict[gene != ""][
, .(archetype_name, gene)])[
, .(number_of_genes = .N), archetype_name][
order(number_of_genes)]
orp_arc <- unique(dict[!archetype_name %in% arc_dt$archetype_name]$archetype_name)
gp_arc <- ggplot(arc_dt, aes(number_of_genes)) +
geom_bar(color="white") +
scale_x_continuous(expand = expansion(mult=0.01)) +
scale_y_continuous(expand = expansion(mult=c(0,0.01))) +
theme(panel.grid.major = element_line(size=0.5)) +
labs(
x = "number of TFs per archetype motif",
y="number of archetype motifs",
caption = sprintf(
"%s motifs with gene(s); %s motifs without a gene",
nrow(arc_dt), length(orp_arc)
)
)
gp_tfs / gp_arc
How many TFs expressed in each group have motifs?
tfs_dt <- merge.data.table(
expr_dt[TF == TRUE][, TF := NULL],
unique(dict[, .(gene, og, pfam, archetype, archetype_name)]),
by = "gene", all.x = TRUE, sort = FALSE
)
dt_tf_1 <- unique(
tfs_dt[, .(gene, archetype, group)][
order(archetype)][
, .SD[1], gene]
)
gp_tf_1 <- ggplot(
dt_tf_1,
aes(group, fill = !is.na(archetype))) +
geom_bar() +
scale_fill_manual(
"",
values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
labels = c(
"TRUE" = sprintf(
"motif (%i)",
length(unique(dt_tf_1[!is.na(archetype)]$gene))
),
"FALSE" = sprintf(
"no motif (%i)",
length(unique(dt_tf_1[is.na(archetype)]$gene))
)
)
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
labs(y = "all TFs", x = "")
dt_tf_2 <- unique(
tfs_dt[method == "clustered"][
, .(gene, archetype, group)][
order(archetype)][
, .SD[1], gene]
)
gp_tf_2 <- ggplot(
dt_tf_2,
aes(group, fill = !is.na(archetype))) +
geom_bar() +
scale_fill_manual(
"",
values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
labels = c(
"TRUE" = sprintf(
"motif (%i)",
length(unique(dt_tf_2[!is.na(archetype)]$gene))
),
"FALSE" = sprintf(
"no motif (%i)",
length(unique(dt_tf_2[is.na(archetype)]$gene))
)
)
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
labs(y = "marker TFs", x = "")
gp_ft <- (gp_tf_1 / gp_tf_2 & theme(
legend.position = "top",
panel.grid.major.y = element_line(size = 0.5)
))
gp_ft
To match missing TFs and motifes, we classified them in TF families. We will focus on TFs used for clustering here.
# TF families
TF_family_annotation_file <- file.path("annotation","gene_families_searchinfo.csv")
tf_fams <- fread(TF_family_annotation_file)
# clustering markers
mark_dt <- fread(file.path(
"RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"
))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# missing TFs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
orp_tfs_dt <- tfs_annt[gene %in% mark_dt$gene]
orp_tfs_dt <- unique(orp_tfs_dt[gene %in% orp_tfs][, .SD[1], gene])
# get family info from gene og
orp_tfs_dt[og!="", tf_family := str_remove(og, "\\.(?<=\\.).+")]
orp_tfs_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
# summarize
orp_tfs_dtn <- orp_tfs_dt[, .N,tf_family][
, prop := N / sum(N)][
order(-N)]
orp_tfs_dtn_plot <- copy(orp_tfs_dtn)
orp_tfs_dtn_plot <- unique(orp_tfs_dtn_plot)
orp_tfs_dtn_plot[, tf_family := factor(
tf_family,
levels = orp_tfs_dtn_plot$tf_family
)]
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# missing motifs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
orp_arc_dt <- unique(dict[archetype_name %in% orp_arc])
# get family info from archetype name
orp_arc_dt[tf_family == "", tf_family := str_extract(
archetype_name, paste(tf_fams[[1]], collapse = "|")
)]
# get family info from motifs names
orp_arc_na <- orp_arc_dt[is.na(tf_family), ][
, tf_family := str_extract(motif,paste(tf_fams[[1]],collapse="|"))][
, .(archetype,tf_family)][
!is.na(tf_family)]
orp_arc_vc <- structure(orp_arc_na$tf_family, names = orp_arc_na$archetype)
orp_arc_dt[archetype %in% names(orp_arc_vc), tf_family := orp_arc_vc[archetype]]
# guess family info from archetype names
grep_list2 <- list(
"fox" = "Forkhead",
"hox" = "Homeodomains",
"sox" = "HMGbox_Sox",
"runx|runt" = "Runt_Runx",
"mads|srf" = "MADS-box_SRF",
"Myb_DNA-bind" = "Myb"
)
nms <- unlist(strsplit(tf_fams[[2]], ","))
fms <- unname(unlist(lapply(1:nrow(tf_fams), function(i)
rep(tf_fams[i,1], length(strsplit(as.character(tf_fams[i,2]), ",")[[1]]))
)))
grep_list1 <- fms; names(grep_list1) <- nms
grep_list <- c(grep_list1, grep_list2)
for (gp in names(grep_list)) {
orp_arc_dt[is.na(tf_family) & grepl(gp, archetype, ignore.case = TRUE),
tf_family := grep_list[[gp]]]
}
orp_arc_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
orp_arc_dt[is.na(tf_family), tf_family := "unknown"]
orp_arc_dtn <- unique(orp_arc_dt[, .(archetype, tf_family)])
# summarize with unknown
orp_arc_dtn_1 <- orp_arc_dt[
, .N, tf_family][
, prop := N / sum(N)][order(-N)]
orp_arc_dtn_1[, tf_family := factor(tf_family, levels = orp_arc_dtn_1$tf_family)]
# summarize without unknown
orp_arc_dtn_2 <- orp_arc_dt[tf_family != "unknown"][
, .N, tf_family][
, prop := N / sum(N)][order(-N)]
orp_arc_dtn_2[, tf_family := factor(tf_family, levels = orp_arc_dtn_2$tf_family)]
# colors
tf_fams <- unique(sort(c(tf_fams[[1]], unique(orp_tfs_dt$tf_family))))
tf_fams <- tf_fams[tf_fams %in% c(as.character(orp_tfs_dtn$tf_family), as.character(orp_arc_dt$tf_family))]
tf_fams <- str_replace_all(tf_fams, "AP2", "AP-2")
tf_fams_cols <- structure(
c(colorRampPalette(RColorBrewer::brewer.pal(8,'Paired'))(length(tf_fams)), "khaki", "grey"),
names = c(tf_fams, "other", "unknown")
)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# plots
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
gp_orp_tfs <- ggplot(orp_tfs_dtn_plot, aes("", N, fill = tf_family)) +
geom_bar(stat = "identity", width = 1, color = "black") +
coord_polar("y", start=0) +
scale_fill_manual(values = tf_fams_cols, limits = force) +
geom_text(
aes(label = ifelse(
N < 1,
"",
sprintf("%s (%s)", tf_family, N)
), x = 1.4),
position = position_stack(vjust = 0.5),
color = "black"
) +
labs(title = "orphan TFs") +
theme_void() +
theme(legend.position = "none")
gp_orp_arc_1 <- ggplot(orp_arc_dtn_1, aes("", N, fill = tf_family)) +
geom_bar(stat="identity", width = 1, color = "black") +
coord_polar("y", start = 0) +
scale_fill_manual(values = tf_fams_cols, limits = force) +
geom_text(
aes(label = ifelse(
N < 10,
"",
sprintf("%s (%s)", tf_family, N)
), x = 1.4),
position = position_stack(vjust = 0.5),
color = "black"
) +
labs(title = "orphan motif archetypes") +
theme_void() +
theme(legend.position = "none")
gp_orp_arc_2 <- ggplot(orp_arc_dtn_2, aes("", N, fill = tf_family)) +
geom_bar(stat="identity", width = 1, color = "black") +
coord_polar("y", start = 0) +
scale_fill_manual(values = tf_fams_cols, limits = force) +
geom_text(
aes(label = ifelse(
N < 1,
"",
sprintf("%s (%s)", tf_family, N)
), x = 1.4),
position = position_stack(vjust = 0.5),
color = "black"
) +
theme_void() +
labs(title = "(without unknown)") +
theme(legend.position = "none")
# save
unmatched_lst <- list(
tfs = orp_tfs_dt,
motifs = orp_arc_dt
)
saveRDS(unmatched_lst, file.path(
res_dir, "unmatched-tfs-motifs.rds"
))
# plot
gp_orp_tfs + gp_orp_arc_1 + gp_orp_arc_2
orp_dtn <- rbindlist(list(
"TFs" = orp_tfs_dtn,
"archetypes" = orp_arc_dtn_1
), idcol = "variable")
orp_dtn <- orp_dtn[tf_family != "unknown"]
gp_orp <- ggplot(orp_dtn, aes(tf_family)) +
geom_bar(
data = subset(orp_dtn, variable == "TFs"),
aes(y = N, fill = tf_family),
stat = "identity",
position = "dodge") +
geom_bar(
data = subset(orp_dtn, variable == "archetypes"),
aes(y = -N, fill = tf_family),
stat = "identity",
position = "dodge"
) +
coord_flip() +
scale_fill_manual(values = tf_fams_cols, limits = force) +
scale_y_continuous(limits = c(NA, NA), labels = abs) +
geom_hline(yintercept = 0, colour = "black", size = 0.25) +
theme(
legend.position = "none",
axis.line = element_blank(),
panel.grid.major.x = element_line(size=0.5)
) +
labs(y = "TFs \t motifs", x = "TF family")
gp_orp
Score archetype motifs in consensus peaks
Motif enrichment in groups defined before
Motif enrichment heatmap
# enrichment scores
dt <- fread(
file.path(res_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(dt, "label", "group")
# qvalue
dat_qv <- dcast.data.table(
unique(dt[, .(motif, group, padj)]), motif ~ group, value.var = "padj"
)
mat_qv <- data.matrix(dat_qv)[,-1]
mat_qv[is.na(mat_qv)] <- 1
rownames(mat_qv) <- dat_qv$motif
write.table(
mat_qv,
file.path(res_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
sep = "\t", quote = FALSE
)
# log2fc
dat_fc <- dcast.data.table(
unique(dt[,.(motif, group, fc)]), motif ~ group, value.var = "fc"
)
mat_fc <- data.matrix(dat_fc)[,-1]
mat_fc[is.na(mat_fc)] <- 0
rownames(mat_fc) <- dat_fc$motif
write.table(
mat_fc,
file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv"),
sep = "\t", quote = FALSE
)
# convert to numeric (replacing , with .) and multiply with -1 (so that highly significant = higher value)
dt[, minuslog10qval := -1*log10(padj)]
# filtering
ids <- apply(mat_fc, 1, function(x) max(x) > 1) &
apply(mat_qv, 1, function(x) !is.infinite(min(x)) & !min(x) > 0.001)
# clustering
hc <- hclust(dist(mat_fc[ids, ]), method = "ward.D2")
ds <- dt[motif %in% rownames(mat_fc)[ids]]
ds[, motif := factor(motif, levels = rev(rownames(mat_fc)[ids]))]
# ordering
mord <- order(apply(mat_fc[ids,], 1, which.max))
ds[, motif := factor(motif, levels=rev(rownames(mat_fc[ids,])[mord]))]
# limit -log10FDR range
ds[, minuslog10qval := pmin(minuslog10qval, 20)]
# add archetype gene annotations
dict <- fread(file.path(
res_dir,
"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
))
dc <- unique(dict[,.(archetype_name, gene)])
setnames(dc, "archetype_name", "motif")
mt_dt <- merge.data.table(
ds, dc,
by = "motif",
all.x = TRUE
)
# add tf annotations
marks_gold <- fread(
file.path("annotation", "golden-marks-230116.tsv"),
fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
tfs_annt <- fread(
file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))
marks_tfs <- merge.data.table(
tfs_annt,
marks_gold,
by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
mt_marks_dt <- merge.data.table(
mt_dt, marks_tfs,
by = "gene",
all.x = TRUE
)
mt_marks_dt[is.na(name), name := ""]
mt_marks_dt[name == "" & pfam != "", name := pfam]
mt_marks_dt[name == "", name := motif]
mt_marks_dt[, motif := factor(motif, levels = levels(ds$motif))]
# add labels
mt_marks_dt[, label := str_extract(name, "(?<=BestGuess_).+")]
mt_marks_dt[is.na(label), label := str_extract(name, "(?<=ARCH\\d{1,3}_).+")]
mt_marks_dt[is.na(label), label := name]
mt_marks_dt[!(pval < 1e5 & fc > 4), label := NA]
# filter labels by RNA plot labels
labels_dt <- marks_tfs[name != ""][, .(name, gene)]
labels_v <- structure(labels_dt$name, names = labels_dt$gene)
mt_marks_dt[is.na(label), label := labels_v[gene]]
mt_marks_dt[!(pval < 1e5 & fc > 3), label := NA]
# plot
labels_p <- function(
data,
column_name,
padj_thrs = 1e-3,
fc_thrs = 4,
operation = c("|","&")[1]
) {
return(
function(m) {
dl <- copy(data)
dl[, column_name := dl[[column_name]]]
dl <- dl[order(padj)][, .SD[1], .(column_name)]
dl <- dl[match(m, column_name)]
if (any(c("|", "OR", "or", "union") %in% operation[1])) {
m[dl$padj < padj_thrs | dl$fc > fc_thrs]
} else if (any(c("&", "AND", "and", "intersect") %in% operation[1])) {
m[dl$padj < padj_thrs & dl$fc > fc_thrs]
}
}
)
}
gp <- ggplot(mt_marks_dt, aes(group, motif, label = label)) +
geom_point(
aes(size = minuslog10qval, fill = fc),
shape = 21,
color = "black"
) +
scale_fill_gradientn(
name = "enrichmnet\nfold change",
colours = c("white","#fee5d9","#fcae91","#fb6a4a","#de2d26","#a50f15", "#7a0105"),
breaks = c(0, 2, 4, 6)
) +
scale_size_continuous(
name = "-log10 FDR",
breaks = c(0, 10, 20)
) +
#scale_y_discrete(
# breaks = labels_p(
# data = mt_marks_dt,
# column_name = "motif",
# padj_thrs = 1e-3,
# fc_thrs = 2,
# operation = "&"
# )
#) +
geom_text_repel(
force = 0.5,
nudge_x = -0.25,
direction = "y",
hjust = 1,
segment.size = 0.2
) +
theme(
panel.grid.major = element_line(size = 0.25),
axis.text.y = element_text(size = 8)
)
gp
## Warning: Removed 1459 rows containing missing values (geom_text_repel).
Match by TF expression and motif enrichment in groups.
unmatched_lst <- readRDS(file.path(
res_dir, "unmatched-tfs-motifs.rds"
))
tfs_df <- unmatched_lst$tfs
mot_df <- unmatched_lst$motifs
tfs_fm <- unique(tfs_df$tf_family)
mot_fm <- unique(mot_df$tf_family)
# TFs group
tfs_grp <- fread(
file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv")
)[, .(gene, group)]
tfs_grp <- tfs_grp[gene %in% tfs_df$gene]
tfs_grp[, group := paste0("G", group)]
tfs_vct <- structure(tfs_grp$group, names = tfs_grp$gene)
tfs_df[, group := tfs_vct[gene]]
# motif group
mot_mat <- read.table(
file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv")
)
mot_vct <- apply(mot_mat, 1, function(x) colnames(mot_mat)[which.max(x)])
mot_enr <- apply(mot_mat, 1, max)
mot_df[, group := mot_vct[archetype_name]]
mot_df[, fc := mot_enr[archetype_name]]
# matching
tfs_mot_map <- function(tfs_df, mot_df) {
tfs <- tfs_df$gene
mts <- sapply(tfs, function(g) {
tfs_dg <- tfs_df[gene == g]
mot_dg_fam <- mot_df[tf_family == tfs_dg$tf_family]
mot_dg_grp <- mot_dg_fam[group == tfs_dg$group]
if (!nrow(mot_dg_grp) > 0) {
NA
} else {
mot_dg_top <- mot_dg_grp[order(-fc)][1]
mot_dg_top$archetype_name
}
}, USE.NAMES = TRUE)
message(sprintf(
"%i TFs were mapped to motifs;",
length(mts[!is.na(mts)])
))
message(sprintf(
"%i TFs could not be mapped.",
length(mts[is.na(mts)])
))
mts
}
tf_mot_mappings <- tfs_mot_map(tfs_df, mot_df)
## 30 TFs were mapped to motifs;
## 42 TFs could not be mapped.
Update dictionary
dict_fnm <- file.path(
res_dir,
"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
)
dict <- fread(dict_fnm)
# TF annotation
tfs_annt <- fread(
file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))
tf_mot_mappings <- tf_mot_mappings[!is.na(tf_mot_mappings)]
tf_mot_annots <- tfs_annt[match(names(tf_mot_mappings), gene)]
tf_mot_og <- structure(tf_mot_annots$og, names = names(tf_mot_mappings))
tf_mot_pfam <- structure(tf_mot_annots$pfam, names = names(tf_mot_mappings))
tf_mot_dt <- data.table(
"archetype_name" = tf_mot_mappings,
"gene" = names(tf_mot_mappings)
)
dict_new <- merge.data.table(
dict, tf_mot_dt, by = "archetype_name",
all = TRUE, sort = FALSE
)
dict_new[!is.na(gene.y), gene.x := gene.y]
dict_new[, gene.y := NULL]
setnames(dict_new, "gene.x", "gene")
dict_new[gene %in% tf_mot_dt$gene, og := tf_mot_og[gene]]
dict_new[gene %in% tf_mot_dt$gene, pfam := tf_mot_pfam[gene]]
fwrite(
dict_new,
file.path(res_dir, "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-mapped.dict"),
sep = "\t"
)
We again identify expressed genes and TFs in groups that were defined by clustering.
# TF annotation
tfs_annt <- fread(
file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))
# groupped genes
mark_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"))
mark_dt[, group := paste0("G", group)]
gene_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters_predicted.tsv"))
gene_dt[, group := group_xgb]
group_dt <- rbindlist(list(
clustered = mark_dt[, .(gene, group)],
predicted = gene_dt[, .(gene, group)]
), idcol = "method")
group_dt[, group := factor(
group,
levels = paste0("G", unique(
sort(as.integer(str_remove(group_dt$group, "G")))
))
)]
expr_dt <- group_dt[gene %in% expr_genes]
expr_dt[, TF := gene %in% expr_tfs]
expr_dt[, TF := factor(TF, levels = c(TRUE, FALSE))]
setorder(expr_dt, group, TF)
How many TFs (do not) have archetypes, and how many archetypes (do not) have a TF?
# archetype dictionary
dict_fnm <- file.path(
res_dir,
"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-mapped.dict"
)
dict <- fread(dict_fnm)
message(sprintf(
"Fraction of TFs with motifs: %s",
mean(expr_tfs %in% dict$gene)
))
# TF annotation
tf_exp <- unique(tfs_annt[gene %in% expr_tfs][, .SD[1], gene])
# TFs without an archetype
tfs_dt <- unique(dict[gene != ""][
, .(archetype_name, gene)])[
, .(number_of_motifs = .N), gene][
order(number_of_motifs)]
orp_tfs <- unique(tf_exp[!gene %in% dict$gene]$gene)
gp_tfs <- ggplot(tfs_dt, aes(number_of_motifs)) +
geom_bar(color = "white") +
scale_x_continuous(expand = expansion(mult = 0.01), breaks = c(1, seq(10, 100, 10))) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
theme(panel.grid.major = element_line(size = 0.5)) +
labs(
x = "number of archetype motifs per TF",
y="number of TFs",
caption = sprintf(
"%s TFs with motif(s); %s TFs without a motif",
nrow(tfs_dt), length(orp_tfs)
)
)
# archetypes without a TF
arc_dt <- unique(dict[gene != ""][
, .(archetype_name, gene)])[
, .(number_of_genes = .N), archetype_name][
order(number_of_genes)]
orp_arc <- unique(dict[!archetype_name %in% arc_dt$archetype_name]$archetype_name)
gp_arc <- ggplot(arc_dt, aes(number_of_genes)) +
geom_bar(color="white") +
scale_x_continuous(expand = expansion(mult=0.01)) +
scale_y_continuous(expand = expansion(mult=c(0,0.01))) +
theme(panel.grid.major = element_line(size=0.5)) +
labs(
x = "number of TFs per archetype motif",
y="number of archetype motifs",
caption = sprintf(
"%s motifs with gene(s); %s motifs without a gene",
nrow(arc_dt), length(orp_arc)
)
)
gp_tfs / gp_arc
How many TFs expressed in each group have motifs?
tfs_dt <- merge.data.table(
expr_dt[TF == TRUE][, TF := NULL],
unique(dict[, .(gene, og, pfam, archetype, archetype_name)]),
by = "gene", all.x = TRUE, sort = FALSE
)
dt_tf_1 <- unique(
tfs_dt[, .(gene, archetype, group)][
order(archetype)][
, .SD[1], gene]
)
gp_tf_1 <- ggplot(
dt_tf_1,
aes(group, fill = !is.na(archetype))) +
geom_bar() +
scale_fill_manual(
"",
values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
labels = c(
"TRUE" = sprintf(
"motif (%i)",
length(unique(dt_tf_1[!is.na(archetype)]$gene))
),
"FALSE" = sprintf(
"no motif (%i)",
length(unique(dt_tf_1[is.na(archetype)]$gene))
)
)
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
labs(y = "all TFs", x = "")
dt_tf_2 <- unique(
tfs_dt[method == "clustered"][
, .(gene, archetype, group)][
order(archetype)][
, .SD[1], gene]
)
gp_tf_2 <- ggplot(
dt_tf_2,
aes(group, fill = !is.na(archetype))) +
geom_bar() +
scale_fill_manual(
"",
values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
labels = c(
"TRUE" = sprintf(
"motif (%i)",
length(unique(dt_tf_2[!is.na(archetype)]$gene))
),
"FALSE" = sprintf(
"no motif (%i)",
length(unique(dt_tf_2[is.na(archetype)]$gene))
)
)
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
labs(y = "marker TFs", x = "")
gp_ft <- (gp_tf_1 / gp_tf_2 & theme(
legend.position = "top",
panel.grid.major.y = element_line(size = 0.5)
))
gp_ft
To match missing TFs and motifes, we classified them in TF families. We will focus on TFs used for clustering here.
# TF families
TF_family_annotation_file <- file.path("annotation","gene_families_searchinfo.csv")
tf_fams <- fread(TF_family_annotation_file)
# clustering markers
mark_dt <- fread(file.path(
"RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"
))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# missing TFs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
orp_tfs_dt <- tfs_annt[gene %in% mark_dt$gene]
orp_tfs_dt <- unique(orp_tfs_dt[gene %in% orp_tfs][, .SD[1], gene])
# get family info from gene og
orp_tfs_dt[og!="", tf_family := str_remove(og, "\\.(?<=\\.).+")]
orp_tfs_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
# summarize
orp_tfs_dtn <- orp_tfs_dt[, .N,tf_family][
, prop := N / sum(N)][
order(-N)]
orp_tfs_dtn_plot <- copy(orp_tfs_dtn)
orp_tfs_dtn_plot <- unique(orp_tfs_dtn_plot)
orp_tfs_dtn_plot[, tf_family := factor(
tf_family,
levels = orp_tfs_dtn_plot$tf_family
)]
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# missing motifs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
orp_arc_dt <- unique(dict[archetype_name %in% orp_arc])
# get family info from archetype name
orp_arc_dt[tf_family == "", tf_family := str_extract(
archetype_name, paste(tf_fams[[1]], collapse = "|")
)]
# get family info from motifs names
orp_arc_na <- orp_arc_dt[is.na(tf_family), ][
, tf_family := str_extract(motif,paste(tf_fams[[1]],collapse="|"))][
, .(archetype,tf_family)][
!is.na(tf_family)]
orp_arc_vc <- structure(orp_arc_na$tf_family, names = orp_arc_na$archetype)
orp_arc_dt[archetype %in% names(orp_arc_vc), tf_family := orp_arc_vc[archetype]]
# guess family info from archetype names
grep_list2 <- list(
"fox" = "Forkhead",
"hox" = "Homeodomains",
"sox" = "HMGbox_Sox",
"runx|runt" = "Runt_Runx",
"mads|srf" = "MADS-box_SRF",
"Myb_DNA-bind" = "Myb"
)
nms <- unlist(strsplit(tf_fams[[2]], ","))
fms <- unname(unlist(lapply(1:nrow(tf_fams), function(i)
rep(tf_fams[i,1], length(strsplit(as.character(tf_fams[i,2]), ",")[[1]]))
)))
grep_list1 <- fms; names(grep_list1) <- nms
grep_list <- c(grep_list1, grep_list2)
for (gp in names(grep_list)) {
orp_arc_dt[is.na(tf_family) & grepl(gp, archetype, ignore.case = TRUE),
tf_family := grep_list[[gp]]]
}
orp_arc_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
orp_arc_dt[is.na(tf_family), tf_family := "unknown"]
orp_arc_dtn <- unique(orp_arc_dt[, .(archetype, tf_family)])
# summarize with unknown
orp_arc_dtn_1 <- orp_arc_dt[
, .N, tf_family][
, prop := N / sum(N)][order(-N)]
orp_arc_dtn_1[, tf_family := factor(tf_family, levels = orp_arc_dtn_1$tf_family)]
# summarize without unknown
orp_arc_dtn_2 <- orp_arc_dt[tf_family != "unknown"][
, .N, tf_family][
, prop := N / sum(N)][order(-N)]
orp_arc_dtn_2[, tf_family := factor(tf_family, levels = orp_arc_dtn_2$tf_family)]
# colors
tf_fams <- unique(sort(c(tf_fams[[1]], unique(orp_tfs_dt$tf_family))))
tf_fams <- tf_fams[tf_fams %in% c(as.character(orp_tfs_dtn$tf_family), as.character(orp_arc_dt$tf_family))]
tf_fams <- str_replace_all(tf_fams, "AP2", "AP-2")
tf_fams_cols <- structure(
c(colorRampPalette(RColorBrewer::brewer.pal(8,'Paired'))(length(tf_fams)), "khaki", "grey"),
names = c(tf_fams, "other", "unknown")
)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# plots
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
gp_orp_tfs <- ggplot(orp_tfs_dtn_plot, aes("", N, fill = tf_family)) +
geom_bar(stat = "identity", width = 1, color = "black") +
coord_polar("y", start=0) +
scale_fill_manual(values = tf_fams_cols, limits = force) +
geom_text(
aes(label = ifelse(
N < 1,
"",
sprintf("%s (%s)", tf_family, N)
), x = 1.4),
position = position_stack(vjust = 0.5),
color = "black"
) +
labs(title = "orphan TFs") +
theme_void() +
theme(legend.position = "none")
gp_orp_arc_1 <- ggplot(orp_arc_dtn_1, aes("", N, fill = tf_family)) +
geom_bar(stat="identity", width = 1, color = "black") +
coord_polar("y", start = 0) +
scale_fill_manual(values = tf_fams_cols, limits = force) +
geom_text(
aes(label = ifelse(
N < 10,
"",
sprintf("%s (%s)", tf_family, N)
), x = 1.4),
position = position_stack(vjust = 0.5),
color = "black"
) +
labs(title = "orphan motif archetypes") +
theme_void() +
theme(legend.position = "none")
gp_orp_arc_2 <- ggplot(orp_arc_dtn_2, aes("", N, fill = tf_family)) +
geom_bar(stat="identity", width = 1, color = "black") +
coord_polar("y", start = 0) +
scale_fill_manual(values = tf_fams_cols, limits = force) +
geom_text(
aes(label = ifelse(
N < 1,
"",
sprintf("%s (%s)", tf_family, N)
), x = 1.4),
position = position_stack(vjust = 0.5),
color = "black"
) +
theme_void() +
labs(title = "(without unknown)") +
theme(legend.position = "none")
# save
unmatched_lst <- list(
tfs = orp_tfs_dt,
motifs = orp_arc_dt
)
saveRDS(unmatched_lst, file.path(
res_dir, "unmatched-tfs-motifs-2.rds"
))
# plot
gp_orp_tfs + gp_orp_arc_1 + gp_orp_arc_2
orp_dtn <- rbindlist(list(
"TFs" = orp_tfs_dtn,
"archetypes" = orp_arc_dtn_1
), idcol = "variable")
orp_dtn <- orp_dtn[tf_family != "unknown"]
gp_orp <- ggplot(orp_dtn, aes(tf_family)) +
geom_bar(
data = subset(orp_dtn, variable == "TFs"),
aes(y = N, fill = tf_family),
stat = "identity",
position = "dodge") +
geom_bar(
data = subset(orp_dtn, variable == "archetypes"),
aes(y = -N, fill = tf_family),
stat = "identity",
position = "dodge"
) +
coord_flip() +
scale_fill_manual(values = tf_fams_cols, limits = force) +
scale_y_continuous(limits = c(NA, NA), labels = abs) +
geom_hline(yintercept = 0, colour = "black", size = 0.25) +
theme(
legend.position = "none",
axis.line = element_blank(),
panel.grid.major.x = element_line(size=0.5)
) +
labs(y = "TFs \t motifs", x = "TF family")
gp_orp
Motif enrichment heatmap
# enrichment scores
dt <- fread(
file.path(res_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(dt, "label", "group")
# qvalue
dat_qv <- dcast.data.table(
unique(dt[, .(motif, group, padj)]), motif ~ group, value.var = "padj"
)
mat_qv <- data.matrix(dat_qv)[,-1]
mat_qv[is.na(mat_qv)] <- 1
rownames(mat_qv) <- dat_qv$motif
write.table(
mat_qv,
file.path(res_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
sep = "\t", quote = FALSE
)
# log2fc
dat_fc <- dcast.data.table(
unique(dt[,.(motif, group, fc)]), motif ~ group, value.var = "fc"
)
mat_fc <- data.matrix(dat_fc)[,-1]
mat_fc[is.na(mat_fc)] <- 0
rownames(mat_fc) <- dat_fc$motif
write.table(
mat_fc,
file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv"),
sep = "\t", quote = FALSE
)
# convert to numeric (replacing , with .) and multiply with -1 (so that highly significant = higher value)
dt[, minuslog10qval := -1*log10(padj)]
# filtering
ids <- apply(mat_fc, 1, function(x) max(x) > 1) &
apply(mat_qv, 1, function(x) !is.infinite(min(x)) & !min(x) > 0.001)
# clustering
hc <- hclust(dist(mat_fc[ids, ]), method = "ward.D2")
ds <- dt[motif %in% rownames(mat_fc)[ids]]
ds[, motif := factor(motif, levels = rev(rownames(mat_fc)[ids]))]
# ordering
mord <- order(apply(mat_fc[ids,], 1, which.max))
ds[, motif := factor(motif, levels=rev(rownames(mat_fc[ids,])[mord]))]
# limit -log10FDR range
ds[, minuslog10qval := pmin(minuslog10qval, 20)]
# add archetype gene annotations
dict <- fread(file.path(
res_dir,
"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-mapped.dict"
))
dc <- unique(dict[,.(archetype_name, gene)])
setnames(dc, "archetype_name", "motif")
mt_dt <- merge.data.table(
ds, dc,
by = "motif",
all.x = TRUE
)
# add tf annotations
marks_gold <- fread(
file.path("annotation", "golden-marks-230116.tsv"),
fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
tfs_annt <- fread(
file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))
marks_tfs <- merge.data.table(
tfs_annt,
marks_gold,
by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
mt_marks_dt <- merge.data.table(
mt_dt, marks_tfs,
by = "gene",
all.x = TRUE
)
mt_marks_dt[is.na(name), name := ""]
mt_marks_dt[name == "" & pfam != "", name := pfam]
mt_marks_dt[name == "", name := motif]
mt_marks_dt[, motif := factor(motif, levels = levels(ds$motif))]
# add labels
mt_marks_dt[, label := str_extract(name, "(?<=BestGuess_).+")]
mt_marks_dt[is.na(label), label := str_extract(name, "(?<=ARCH\\d{1,3}_).+")]
mt_marks_dt[is.na(label), label := name]
mt_marks_dt[!(pval < 1e5 & fc > 4), label := NA]
# filter labels by RNA plot labels
labels_dt <- marks_tfs[name != ""][, .(name, gene)]
labels_v <- structure(labels_dt$name, names = labels_dt$gene)
mt_marks_dt[is.na(label), label := labels_v[gene]]
setorder(mt_marks_dt, -minuslog10qval)
mt_marks_dt[, i:=1:.N, .(gene,motif)]
mt_marks_dt[i > 1, label := NA]
# mt_marks_dt[!(pval < 1e5 & fc > 3), label := NA]
# plot
labels_p <- function(
data,
column_name,
padj_thrs = 1e-3,
fc_thrs = 4,
operation = c("|","&")[1]
) {
return(
function(m) {
dl <- copy(data)
dl[, column_name := dl[[column_name]]]
dl <- dl[order(padj)][, .SD[1], .(column_name)]
dl <- dl[match(m, column_name)]
if (any(c("|", "OR", "or", "union") %in% operation[1])) {
m[dl$padj < padj_thrs | dl$fc > fc_thrs]
} else if (any(c("&", "AND", "and", "intersect") %in% operation[1])) {
m[dl$padj < padj_thrs & dl$fc > fc_thrs]
}
}
)
}
gp <- ggplot(mt_marks_dt, aes(group, motif, label = label)) +
geom_point(
aes(size = minuslog10qval, fill = fc),
shape = 21,
color = "black"
) +
scale_fill_gradientn(
name = "enrichmnet\nfold change",
colours = c("white","#fee5d9","#fcae91","#fb6a4a","#de2d26","#a50f15", "#7a0105"),
breaks = c(0, 2, 4, 6)
) +
scale_size_continuous(
name = "-log10 FDR",
breaks = c(0, 10, 20)
) +
#scale_y_discrete(
# breaks = labels_p(
# data = mt_marks_dt,
# column_name = "motif",
# padj_thrs = 1e-3,
# fc_thrs = 2,
# operation = "&"
# )
#) +
geom_text_repel(
force = 0.5,
nudge_x = -0.25,
direction = "y",
hjust = 1,
segment.size = 0.2
) +
theme(
panel.grid.major = element_line(size = 0.25),
axis.text.y = element_text(size = 8)
)
gp
## Warning: Removed 1598 rows containing missing values (geom_text_repel).
Merge replicates per condition
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
nth=18
for line in Fox Elav Ncol
do
for cond in pos neg
do
name=${line}_${cond}
echo ${name}
bams=$( echo ${bam_dir}/${name}*bam )
samtools merge -@ ${nth} ${bam_dir}/${name}.ncfree.bam ${bams}
samtools sort -@ ${nth} -o ${bam_dir}/${name}.ncfree.sorted.bam ${bam_dir}/${name}.ncfree.bam
rm ${bam_dir}/${name}.ncfree.bam
samtools index -@ ${nth} ${bam_dir}/${name}.ncfree.sorted.bam
done
done
Footprint score calculation for consensus peaks
conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
out_dir="ATACSEQ/nucleosome_free_regions/footprint/"
mkdir ${out_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
genome="genome/Nvec_vc1.1_gDNA.fasta"
nth=12
for line in Fox Elav Ncol
do
for cond in pos neg
do
name=${line}_${cond}
echo $(date) "- Starting ATACorrect for" ${name}
TOBIAS ATACorrect \
--bam ${bam_dir}/${name}.ncfree.sorted.bam \
--genome ${genome} \
--peaks ${peaks} \
--prefix ${name} \
--outdir ${out_dir}/ATACorrect \
--cores ${nth}
echo $(date) "- Starting FootprintScores"
TOBIAS FootprintScores \
--signal ${out_dir}/ATACorrect/${name}_corrected.bw \
--regions ${peaks} \
--fp-min 10 --fp-max 50 \
--output ${out_dir}/${name}_footprints.bw \
--cores ${nth}
done
done
Plot difference in footprints
conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
for line in Fox Elav Ncol
do
TOBIAS PlotAggregate --TFBS ${peaks} \
--signals ${bwg_dir}/${line}_pos_corrected.bw ${bwg_dir}/${line}_neg_corrected.bw \
--output ${plt_dir}/${line}_footprint_comparison_all_peaks.pdf \
--flank 125 \
--share_y both \
--plot_boundaries \
--signal-on-x
done
TOBIAS PlotAggregate --TFBS ${peaks} \
--signals ${bwg_dir}/Elav_pos_corrected.bw ${bwg_dir}/Fox_pos_corrected.bw ${bwg_dir}/Ncol_pos_corrected.bw \
--output ${plt_dir}/pos_footprint_comparison_all_peaks.pdf \
--flank 150 \
--share_y both \
--plot_boundaries \
--signal-on-x
Combine motif scores and footprint scores to determine motif binding.
mts_fn <- file.path(res_dir, "motif-scores-archetypes-mona-annot.tsv")
ftp_fn <- list.files(ftp_dir, pattern = "bw$", full.names = TRUE)
bin_det <- mta_binding_detect(
mts_fn = mts_fn,
ftp_fn = ftp_fn,
pks_smpl = 1e6,
pval = 1e-3,
seed = 1950
)
bnd_dt <- bin_det$motifs_df
bnd_dt[, sample := str_remove(sample, "_footprints")]
bnd_gr <- makeGRangesFromDataFrame(bnd_dt, keep.extra.columns = TRUE)
seqnames(bnd_gr) <- seqnames(BSgenome.jaNemVect1.1.DToL.Assembly)
bnd_gr_r <- GenomicRanges::restrict(bnd_gr)
bnd_dt <- as.data.table(bnd_gr_r)
fwrite(
bnd_dt,
file.path(res_dir, "motif-scores-archetypes-mona-annot-footprints.tsv.gz")
)
bkg_dist <- bin_det$background_dist
bkg_dt <- rbindlist(lapply(names(bkg_dist), function(smp) {
x <- bkg_dist[[smp]]
y <- bnd_dt[sample == smp]$ftp_threshold[1]
z <- str_remove(smp, "_footprint")
data.table(
x = rnorm(1e3, mean = x$estimate[1], sd = x$estimate[2])
)[, threshold := y][, sample := z]
}))
bkg_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]
fwrite(
bkg_dt,
file.path(res_dir, "motif-footprints-background.tsv.gz")
)
Motif hits are labeled as bound or unbouned based on threshold determined from background distribution of footprint scores (p value of normal fit to background = 0.001).
# background footprint scores
bkg_dt <- fread(file.path(res_dir, "motif-footprints-background.tsv.gz"))
gp_bckg <- ggplot(bkg_dt, aes(x, color = reporterline)) +
geom_density() +
scale_color_manual(values = line_cols) +
scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
geom_vline(
data = unique(bkg_dt[,.(threshold, sample, reporterline)]),
aes(xintercept = threshold, color = reporterline),
size = 2, alpha = 0.2
) +
labs(x = "background footprint scores") +
theme(
legend.position = "bottom"
)
# foreground footprint scores
bnd_dt <- fread(file.path(res_dir, "motif-scores-archetypes-mona-annot-footprints.tsv.gz"))
bnd_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]
bnd_lab <- c("TRUE" = "bound", "FALSE" = "unbound")
gp_frwd <- ggplot(bnd_dt, aes(ftp_score, color = reporterline)) +
geom_density() +
scale_color_manual(values = line_cols) +
scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
scale_x_continuous(limits = c(NA, 200), oob = function(x, limits) x) +
geom_vline(
data = unique(bnd_dt[,.(ftp_threshold, sample, reporterline)]),
aes(xintercept = ftp_threshold, color = reporterline),
size = 2, alpha = 0.2
) +
labs(x = "foreground footprint scores") +
theme(
legend.position = "bottom"
) +
facet_grid(bound ~ ., scales = "free_y", labeller = as_labeller(bnd_lab))
gp_bnd <- (gp_bckg / gp_frwd & theme(legend.position = "bottom")) +
plot_layout(guides = "collect", heights = c(1, 2))
gp_bnd
Save bed files for bound and unbound sites
bnd_dt <- fread(file.path(res_dir, "motif-scores-archetypes-mona-annot-footprints.tsv.gz"))
bnd_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]
bin_dir <- file.path(ftp_dir, "BINDetect")
dir.create(bin_dir, showWarnings = FALSE)
bed_cols <- c("seqnames", "start", "end", "motif", "ftp_score", "strand")
for (smp in unique(bnd_dt$sample)) {
message(smp)
bnd_smp <- bnd_dt[sample == smp]
dir.create(file.path(bin_dir, smp), showWarnings = FALSE)
# all motif sites
fwrite(
unique(bnd_smp[, ..bed_cols]),
file.path(bin_dir, smp, sprintf("%s_all.bed", smp)),
col.names = FALSE, sep ="\t"
)
# all bound motif sites
fwrite(
unique(bnd_smp[bound == TRUE][, ..bed_cols]),
file.path(bin_dir, smp, sprintf("%s_all_bound.bed", smp)),
col.names = FALSE, sep ="\t"
)
# all unbound motif sites
fwrite(
unique(bnd_smp[bound == FALSE][, ..bed_cols]),
file.path(bin_dir, smp, sprintf("%s_all_unbound.bed", smp)),
col.names = FALSE, sep ="\t"
)
for (mot in unique(bnd_smp$motif)) {
# message(mot)
bmt_dt <- bnd_smp[motif == mot]
mot <- str_replace_all(mot, c("/" = "_"))
dir.create(file.path(bin_dir, smp, mot), showWarnings = FALSE)
# all motif sites
fwrite(
unique(bmt_dt[, ..bed_cols]),
file.path(bin_dir, smp, mot, sprintf("%s_%s_all.bed", smp, mot)),
col.names = FALSE, sep ="\t"
)
# all bound motif sites
fwrite(
unique(bmt_dt[bound == TRUE][, ..bed_cols]),
file.path(bin_dir, smp, mot, sprintf("%s_%s_all_bound.bed", smp, mot)),
col.names = FALSE, sep ="\t"
)
# all unbound motif sites
fwrite(
unique(bmt_dt[bound == FALSE][, ..bed_cols]),
file.path(bin_dir, smp, mot, sprintf("%s_%s_all_unbound.bed", smp, mot)),
col.names = FALSE, sep ="\t"
)
}
}
Plot difference in footprint for bound and unbound sites
conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
bed_dir="ATACSEQ/nucleosome_free_regions/footprint/BINDetect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}
cond="pos"
for line in Elav Fox Ncol
do
TOBIAS PlotAggregate \
--TFBS ${bed_dir}/${line}_${cond}/${line}_${cond}_all.bed \
${bed_dir}/${line}_${cond}/${line}_${cond}_all_bound.bed \
${bed_dir}/${line}_${cond}/${line}_${cond}_all_unbound.bed \
--signals \
${bwg_dir}/${line}_pos_corrected.bw \
${bwg_dir}/${line}_neg_corrected.bw \
--output ${plt_dir}/${line}_footprint_comparison_bound_peaks.pdf \
--flank 125 \
--share_y sites \
--plot_boundaries
done
Plot the same for selected motifs
conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
bed_dir="ATACSEQ/nucleosome_free_regions/footprint/BINDetect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}
mot="ARCH551_POU4"
cond="pos"
line1="Elav"
line2="Fox"
line3="Ncol"
TOBIAS PlotAggregate \
--TFBS \
${bed_dir}/${line}_${cond}/${mot}/${line}_${cond}_${mot}_all_bound.bed \
${bed_dir}/${line}_${cond}/${mot}/${line}_${cond}_${mot}_all_unbound.bed \
--signals \
${bwg_dir}/${line1}_pos_corrected.bw \
${bwg_dir}/${line2}_pos_corrected.bw \
${bwg_dir}/${line3}_pos_corrected.bw \
--output ${plt_dir}/${line}_${mot}_footprint_comparison_bound_peaks.pdf \
--flank 125 \
--share_y sites \
--plot_boundaries